### Configuration
################################################################################
### R Options
nodeInfo = system(paste0("scontrol show node ", Sys.info()["nodename"]), intern = TRUE)
cpus = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "cpu="))[2], ","))[1])
memInKB = as.integer(unlist(strsplit(unlist(strsplit(nodeInfo[grep("CfgTRES", nodeInfo)], "mem="))[2], "M,"))[1]) * 1024^2

options(stringsAsFactors=FALSE,
        dplyr.summarise.inform=FALSE, 
        future.globals.maxSize=min(memInKB, 20 * 1024^3),
        mc.cores=min(cpus,1),
        future.fork.enable=TRUE, future.plan="multicore",
        future.rng.onMisuse="ignore")

### Fuctions
# Git directory and files to source must be done first, then all helper functions can be sourced
git_files_to_source = c("functions_analysis.R", 
                        "functions_biomart.R", 
                        "functions_degs.R", 
                        "functions_io.R", 
                        "functions_plotting.R", 
                        "functions_util.R")
git_files_to_source = file.path(param$path_to_git, "R", git_files_to_source)
file_exists = purrr::map_lgl(git_files_to_source, file.exists)
if (any(!file_exists)) stop(paste("The following files could not be found:", paste(git_files_to_source[!file_exists], collapse=", "), ". Please check the git directory at '", param$path_to_git, "'.!"))
invisible(purrr::map(git_files_to_source, source))

### Debugging mode
# 'default_debugging' for default, 'terminal_debugger' for debugging without X11, 'print_traceback' for non-interactive sessions 
param$debugging_mode = "default_debugging"
switch (param$debugging_mode, 
        default_debugging=on_error_default_debugging(), 
        terminal_debugger=on_error_start_terminal_debugger(),
        print_traceback=on_error_just_print_traceback(),
        on_error_default_debugging())
### Rmarkdown configuration
################################################################################

### Create output directories
if (!file.exists(file.path(param$path_out, "figures"))) dir.create(file.path(param$path_out, "figures"), recursive=TRUE, showWarnings=FALSE)
if (!file.exists(file.path(param$path_out, "data"))) dir.create(file.path(param$path_out, "data"), recursive=TRUE, showWarnings=FALSE)


### R Options
options(citation_format="pandoc", 
        knitr.table.format="html",
        kableExtra_view_html=TRUE)


### Knitr default options
knitr::opts_chunk$set(echo=TRUE,                     # output code
                      cache=FALSE,                   # do not cache results
                      message=TRUE,                  # show messages
                      warning=TRUE,                  # show warnings
                      results = "hold",              # show results after running whole chunk
                      tidy=FALSE,                    # do not auto-tidy-up code
                      fig.width=10,                  # default fig width in inches
                      class.source='fold-hide',      # by default collapse code blocks
                      dev=c('png', 'svg', 'tiff'),          # create figures in png, tiff, and svg; the first device (png) will be used for HTML output
                      dev.args=list(png = list(type="cairo"),     # png: use cairo - works on cluster
                                    tiff = list(type="cairo", compression = 'zip')),  # tiff: use cairo - works on cluster, transparent background, compression type "zip" is ‘deflate (Adobe-style)’ 
                      dpi=300,                       # figure resolution
                      fig.retina=2,                 # retina multiplier
                      fig.path=paste0(file.path(param$path_out, "figures"), "/")  # Path for figures in png and pdf format (trailing "/" is needed)
)

### Set output hooks
knitr::knit_hooks$set(message=format_message, warning=format_warning)

### Required libraries
library(magrittr)

### Set the bibliographic environment
# Clear previous bibliography
knitcitations::cleanbib()

# If the DOI publication servers cannot be reached, there will be no citations, knitcitations will not write a references.bib file and pandoc will stop. This makes sure that there is always at least one citation.
bib_ref = knitcitations::read.bibtex(file.path(param$path_to_git,"assets","sc_analysis_references.bib"))
invisible(knitcitations::citep(bib_ref))

### Figure heights
# high figures
fig_large_height = 8
# Single figure, landscape
fig_standard_height = 4
# Two plots alongside (e.g. umaps)
fig_standard2_height = 5
# Three plots alongside (e.g. umaps)
fig_standard3_height = 4
# Four plots alongside (e.g. umaps)
fig_standard4_height = 2.5
# Four plots 2x2 (e.g. umaps)
fig_patchwork4_height = fig_standard2_height * 2
# Six plots 2x3 (e.g. umaps)
fig_patchwork6_height = fig_standard3_height * 2
# Eight plots 4x2 (e.g. umaps)
fig_patchwork8_height = fig_standard4_height * 2
# Twelve plots 4x3 (e.g. umaps)
fig_patchwork12_height = fig_standard4_height * 3
# Sixteen plots 4x4 (e.g. umaps)
fig_patchwork16_height = fig_standard4_height * 4
# Load renv
renv::load(file.path(param$path_to_git,"env/basic"))

# Required libraries
library(Seurat)
library(patchwork)
library(magrittr)
library(ggplot2)
library(ggsci)
library(pheatmap)

Introduction

Single cell transcriptomes can be difficult to annotate without extensive knowledge of the underlying biology. Hence, the biological knowledge (defined marker genes and cluster identities) can be propagated from an previously annotated dataset to the test dataset in an automated manner and aid in cluster identification.

Here, we project reference data onto a query object to annotate the cells of the query datasets by cell type label transfer and projecting query cells onto reference UMAPs as described in the Seurat vignette ‘Mapping and annotating query datasets’ (https://satijalab.org/seurat/articles/integration_mapping.html). The reference and the query datasets have to be available as S4 class Seurat objects. While the reference object has to contain normalized data and UMAP reduction, correction of the underlying raw query data is not required.

Be aware, that the output is highly impacted by the quality and contend of the reference dataset!

### Read rds object
################################################################################

### Load Seurat S4 objects 
# Test if file is defined
if (is.null(param$data)) {
  message("Dataset is not specified")
} else {
  # Test if file exists
  if (file.exists(param$data)) {
    # Read object
    message(paste0("Load dataset:", param$data))
    sc = base::readRDS(param$data)
    
    # Transfer original params to loaded object
    if ("parameters" %in% list_names(sc@misc[])) {
      # Retrieve previous parameter settings
      orig_param = sc@misc$parameters
      if ("SCT" %in% names(sc@assays)) {
        if ("scale.data" %in% Layers(sc[["SCT"]])) {
          orig_param$norm = "SCT"
        } else {
          orig_param$norm = "RNA"
        }
      } else {
        orig_param$norm = "RNA"
      }
      
      # Keep some parameter settings from object and project defined
      orig_param_keep = orig_param[c("annot_version", "species")]
      basic_param_keep = param[c("path_to_git", "scriptname", "author", "project_id", "data", "path_out", "file_annot", "file_cc_genes")]
      
      # Integrate parameter
      param = modifyList(x = param, val = orig_param)
      param = modifyList(x = param, val = basic_param_keep)
      param = modifyList(x = param, val = param_advset)
      param = modifyList(x = param, val = orig_param_keep)
    }
    
    
    ### Set colors
    # Set sample colors based on orig.ident
    if (!is.null(sc@meta.data[["orig.ident"]])) {
      if (length(unique(sc@meta.data[["orig.ident"]]))!=length(param$col_samples)) {
        message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(sc, param$col_palette_samples)) 
        sc = sample_colours_out[[1]]
        param$col_samples = sample_colours_out[[2]]
      }
    }
 
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(sc@meta.data[["seurat_clusters"]])) {
      if (length(unique(sc@meta.data[["seurat_clusters"]]))!=length(param$col_clusters)) {
        message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(sc, param$col_palette_clusters)) 
        sc = cluster_colours_out[[1]]
        param$col_clusters = cluster_colours_out[[2]]
      }
    }

    # Set colors for cell types based on annotation 
    if (!is.null(sc@meta.data[["annotation"]])) {
      if (length(unique(sc@meta.data[["annotation"]]))!=length(param$col_annotation)) {
        message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(sc, param$col_palette_annotation)) 
        sc = annotation_colours_out[[1]]
        param$col_annotation = annotation_colours_out[[2]]
      }
    }

  sc
  } else {
  message("Dataset does not exist")
  }
}

× (Message)
Load dataset:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/cluster_analysis/data/sc.rds

### Load reference Seurat S4 objects if specified
# Test if file is defined
if (!is.null(param$refdata)) {
  # Test if file exists
  if (file.exists(param$refdata)) {
   # Read object
    message(paste0("Load dataset:", param$refdata)) 
    scR = base::readRDS(param$refdata)
    
    # Transfer original params to loaded object
    if ("parameters" %in% list_names(scR@misc[])) {
      orig_paramR = scR@misc$parameters
      
      if (!is.null(orig_paramR$col_samples)) {
        param$col_samples_ref = orig_paramR$col_samples
      }
      if (!is.null(orig_paramR$col_clusters)) {
        param$col_clusters_ref = orig_paramR$col_clusters
      }
      if (!is.null(orig_paramR$col_annotation)) {
        param$col_annotation_ref = orig_paramR$col_annotation
      }
      param = modifyList(x = param, val = param_advset)
    }
    
    ### Set colors
    # Set sample colors based on orig.ident
    if (!is.null(scR@meta.data[["orig.ident"]])) {
      if (length(unique(scR@meta.data[["orig.ident"]]))!=length(param$col_samples_ref)) {
        message(paste0("No or wrong number of distinct colors for samples provieded. Generate colors using color palette", param$col_palette_samples))
        sample_colours_out = suppressWarnings(SetSampleColours(scR, param$col_palette_samples)) 
        scR = sample_colours_out[[1]]
        param$col_samples_ref = sample_colours_out[[2]]
      }
    }
    
    # Set colors for clusters based on seurat_clusters 
    if (!is.null(scR@meta.data[["seurat_clusters"]])) {
      if (length(unique(scR@meta.data[["seurat_clusters"]]))!=length(param$col_clusters_ref)) {
        message(paste0("No or wrong number of distinct colors for clusters provieded. Generate colors using color palette", param$col_palette_clusters))
        cluster_colours_out = suppressWarnings(SetClusterColours(scR, param$col_palette_clusters)) 
        scR = cluster_colours_out[[1]]
        param$col_clusters_ref = cluster_colours_out[[2]]
      }
    }
    
    # Set colors for cell types based on annotation 
    if (!is.null(scR@meta.data[["annotation"]])) {
      if (length(unique(scR@meta.data[["annotation"]]))!=length(param$col_annotation_ref)) {
        message(paste0("No or wrong number of distinct colors for cell tpye annotation provieded. Generate colors using color palette", param$col_palette_annotation))
        annotation_colours_out = suppressWarnings(SetAnnotationColours(scR, param$col_palette_annotation)) 
        scR = annotation_colours_out[[1]]
        param$col_annotation_ref = annotation_colours_out[[2]]
      }
    }
    
  scR
  } else {
  message("Reference dataset does not exist")
  }
}  

× (Message)
Load dataset:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata_2/cluster_analysis/data/sc.rds

## An object of class Seurat 
## 6439 features across 1078 samples within 1 assay 
## Active assay: RNA (6439 features, 3000 variable features)
##  3 layers present: data, counts, scale.data
##  2 dimensional reductions calculated: pca, umap
## An object of class Seurat 
## 14043 features across 1078 samples within 1 assay 
## Active assay: RNA (14043 features, 3000 variable features)
##  3 layers present: counts, data, scale.data
##  2 dimensional reductions calculated: pca, umap
# Expand colors for cell type annotation to to both datasets
param$col_annotation_all = c(param$col_annotation_ref, 'NA' = "#BA6338FF")


Annotation and mapping of query datasets using Seurat refernce integration

Cell type classification was performed by projecting a reference dataset onto a query object by transfer of cell type labels from the reference to cells of the query. In the next step, the query data are then project onto the UMAP structure of the reference as described in the vignette https://satijalab.org/seurat/articles/integration_mapping.

Transfer of cell type labels

The reference-mapping of the query dataset helps us identify cell types. However, the output, means the annotation of cells of the query dataset, is highly impacted by the quality and content of the reference dataset. If there are cell types that are present in the query dataset that are not represented in the reference, they will project to the most similar cells in the reference dataset. Hence, we perform assessment of the prediction score that each cell obtains representing the strength of the prediction. Each prediction is assigned a score between 0 and 1. We test different thresholds.

# From https://satijalab.org/seurat/articles/integration_mapping.html

### Projection of query data onto the UMAP structure of the reference does only work if model is known.
# If reference object does not contain an umap dimensional reduction object
if (is.null(scR@reductions$umap)) {
scR = Seurat::RunUMAP(scR, dims=1:30, verbose=FALSE, umap.method="uwot", n.neighbors=30, return.model = TRUE)
}

# If RunUMAP() was run without return.model = TRUE, set the model manually with information extracted from the seurat object
if (is.null(scR[["umap"]]@misc$model)) {
  # set embeddings
  scR[["umap"]] <- CreateDimReducObject(embeddings = scR[["umap"]]@cell.embeddings, key = "UMAP_")
  
  # set UMAP models
  umap.model <- list()
  umap.model$n_epochs <- 200
  umap.model$alpha <-1
  umap.model$method <- "umap"
  umap.model$negative_sample_rate <- scR@commands$RunUMAP.RNA.pca$negative.sample.rate 
  umap.model$gamma <- 1
  umap.model$approx_pow <- FALSE
  umap.model$metric$cosine$ndim <- length(scR@commands$RunUMAP.RNA.pca$dims) 
  umap.model$min_dist = scR@commands$RunUMAP.RNA.pca$min.dist
  umap.model$spread = scR@commands$RunUMAP.RNA.pca$spread
  umap.model$n_neighbors = scR@commands$RunUMAP.RNA.pca$n.neighbors
  umap.model$pca_models = list()
  umap.model$spread = scR@commands$RunUMAP.RNA.pca$uwot.sgd
  umap.model$embedding <- scR[["umap"]]@cell.embeddings 
  ab_param <- uwot:::find_ab_params(spread = scR@commands$RunUMAP.RNA.pca$spread, min_dist = scR@commands$RunUMAP.RNA.pca$min.dist) 
  umap.model$a <- ab_param["a"]
  umap.model$b <- ab_param["b"]
  scR[["umap"]]@misc$model <- umap.model
}

# Find anchors between reference and query
DefaultAssay(scR) <- "RNA"
DefaultAssay(sc) <- "RNA"
sc.anchors = FindTransferAnchors(reference = scR, query = sc, dims = 1:30, reference.reduction = "pca")

# MapQuery(), a wrapper of TransferData(), IntegrateEmbeddings(), and ProjectUMAP(), transfers cell type labels and impute the ADT values (TransferData()) and projects the query data onto the UMAP structure of the reference (IntegrateEmbeddings() and ProjectUMAP()). Predicted cell types are saved in a predicted.celltype column.
sc = MapQuery(anchorset = sc.anchors, reference = scR, query = sc, refdata = list(celltype = param$celltype), reference.reduction = "pca", reduction.model = "umap")

# Transfer refdata annotation to an additional column of the reference dataset for later visualisation.  
scR$annot_cells = scR[[param$celltype]]
# Extract numbers predicted celltypes
pred_cells_table = data.frame(table(sc$predicted.celltype))
colnames(pred_cells_table) = c("Cell_type", "0")

# Filter for predicted.celltype.score
pred_score_thresholod = seq(0.1, 1, 0.1)
for (i in pred_score_thresholod) {
 sc_subset <- subset(sc, subset = predicted.celltype.score > i)
 sc$annot_cells = NULL
 sc$annot_cells = sc_subset$predicted.celltype 
 
 # Extract numbers predicted celltypes
 pred_celltypes_filtered_tbl = data.frame(table(sc$annot_cells))
 colnames(pred_celltypes_filtered_tbl) = c("Cell_type", i)
 
 # Combine and print table
 pred_cells_table = merge(pred_cells_table, pred_celltypes_filtered_tbl, by.x = "Cell_type", all.x = TRUE)

}

knitr::kable(pred_cells_table, align="l", caption="Number of annotated cells for each prediction score threshold") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) 
Number of annotated cells for each prediction score threshold
Cell_type 0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
B-cells 185 185 185 185 185 185 185 185 185 184 60
CD4+ T-cells 353 353 353 353 353 353 352 347 339 328 60
CD8+ T-cells 128 128 128 128 127 126 125 122 122 115 28
Monocytes 358 358 358 358 358 358 358 356 355 350 109
NK cells 54 54 54 54 54 54 54 53 52 50 19
tbl = tidyr::pivot_longer(pred_cells_table, cols = 2:12, cols_vary = "fastest", names_to = "Threshold", values_to = "Number")
tbl <- tbl %>% replace(is.na(.), 0)
p = ggplot(tbl) +
  geom_point(aes(x = Threshold, y = Number, color = Cell_type)) +
  theme_bw() +
  scale_color_manual(values=param$col_annotation_ref)  
p

In the following the prediction score threshold of 0.9 was used to exclude unlikely cell type annotations.

# Filter by sub-setting for threshold
sc_subset <- subset(sc, subset = predicted.celltype.score > param$predicted_score_threshold)
pred_celltypes = data.frame(table(sc_subset$predicted.celltype))
pred_celltypes = dplyr::mutate(pred_celltypes, Percent = (Freq/sum(pred_celltypes$Freq)*100))

# Filter by subsetting for cells belonging to a specific cell type annotation
filtered_pred_celltypes = dplyr::filter(pred_celltypes, Percent > param$percent_predicted_cells_threshold)
celltype_filter = as.vector(filtered_pred_celltypes$Var1)
Idents(sc_subset) = "predicted.celltype"
sc_subset <- subset(sc_subset, idents = celltype_filter)

# Transfer subsetting results into sc object as annot_cells variable
sc$annot_cells = NULL
sc$annot_cells = sc_subset$predicted.celltype 

# Print table 
annot_cells = data.frame(table(sc$annot_cells))
colnames(annot_cells) = c("Cell_type", "Number")
unassiged = data.frame("unassigend",(length(colnames(sc))-sum(table(sc$annot_cells))))
colnames(unassiged) = c("Cell_type", "Number")
annot_cells = rbind(annot_cells, unassiged)
annot_cells = dplyr::mutate(annot_cells, Percent = (Number/sum(annot_cells$Number)*100))

knitr::kable(annot_cells, align="l", caption="Number and percentage of annotated cells") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover")) 
Number and percentage of annotated cells
Cell_type Number Percent
B-cells 184 17.068646
CD4+ T-cells 328 30.426716
CD8+ T-cells 115 10.667903
Monocytes 350 32.467532
NK cells 50 4.638219
unassigend 51 4.730983
fig_annot_celltypes = ceiling(length(pred_celltypes)/3) * 4

Visualization of the prediction score of annotated cell types in the query dataset.

# plot cell type prediction scores
DefaultAssay(sc) <- 'prediction.score.celltype'

p <- FeaturePlot(sc,  reduction = "ref.umap", features = celltype_filter, ncol = 3,
                  cols = c(param$col_bg, param$col))
p

# plot cell type prediction scores
p <- FeaturePlot(sc, reduction = param$reduction, features = celltype_filter, ncol = 3,
                  cols = c(param$col_bg, param$col))
p

DefaultAssay(sc) <- "RNA"


Unimodal UMAP Projection

Reference dataset projection

Visualization query cells projected into the UMAP visualization defined by the reference alongside the reference.

p1 = Seurat::DimPlot(scR, reduction = "umap", group.by = "annot_cells", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE, shuffle = TRUE) + 
  AddStyle(title="Reference dataset with annotations", xlab = "UMAP 1", ylab = "UMAP 2") +
  theme(title = element_text(size = 10)) +
  NoGrid()
p2 = Seurat::DimPlot(sc, reduction = "ref.umap", group.by = "annot_cells", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE, shuffle = TRUE) + 
  AddStyle(title="Cells of query dataset projected on reference UMAP", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p = p1 + p2
p

Query dataset projection

Visualization annotated query cells in original UMAP visualization alongside the original query annotation.

p1 = Seurat::DimPlot(sc, reduction = param$reduction, group.by = param$celltype, pt.size=param$pt_size, label = TRUE, cols = param$col_annotation, label.size = 3, repel = TRUE, shuffle = TRUE) + 
  AddStyle(title="Query dataset with original annotation", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p2 = Seurat::DimPlot(sc, reduction = param$reduction, group.by = "annot_cells", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE, shuffle = TRUE) + 
  AddStyle(title="Query dataset with transferred labels", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme(title = element_text(size = 10)) +
  NoGrid()
p = p1 + p2
p



Computing a new UMAP visualiztion

Above, we have projected the query cells onto to the reference-derived UMAP. Keeping a consistent visualization can assist with the interpretation of new datasets. However, the limitation of this approach lies also exactly within the predefined cell types and reference-derived UMAP and can mask the presence of new cell types in the query which may be of interest. Hence, if the query dataset contains cell types that are not present in the reference, computing a ‘de novo’ visualization is an important step in interpreting the dataset.

This part follows the example of the vignette https://satijalab.org/seurat/articles/multimodal_reference_mapping.

Annotation with reference dataset

# Slim down a Seurat object to reduces the amount of data in the subsequent merge. 
reference = DietSeurat(scR, counts = FALSE, dimreducs = "pca")
query = DietSeurat(sc, counts = FALSE, dimreducs = "ref.pca")

# Make cell names unique
colnames(reference) = paste("reference", colnames(reference), sep = "_")
colnames(query) = paste("query", colnames(query), sep = "_")

# Select equal amount of pca of both datasets
red_pca = reference[["pca"]][[,1:30]]
reference[["red_pca"]] = CreateDimReducObject(embeddings = red_pca, key = "PC_", assay = DefaultAssay(reference))
red_pca = query[["ref.pca"]][[,1:30]]
query[["red_pca"]] = CreateDimReducObject(embeddings = red_pca, key = "PC_", assay = DefaultAssay(query))

# Merge both datasets
reference$id <- 'reference'
query$id <- 'query'

refquery = base::merge(reference, query)
refquery[["pca"]] = base::merge(reference[["red_pca"]], query[["red_pca"]])
refquery = Seurat::RunUMAP(refquery, reduction = 'pca', dims=1:param$pc_n, verbose=FALSE, umap.method="uwot", n.neighbors=param$umap_k)

# Set sample colors
param$col_samples = c('reference' = "#5DB1DDFF", 'query' = "#802268FF")
p1 = DimPlot(refquery, group.by = "id", shuffle = TRUE, pt.size=param$pt_size, cols = param$col_samples, label = TRUE, label.size = 3, repel = TRUE) + 
  AddStyle(title="Groups", xlab = "UMAP 1", ylab = "UMAP 2") + 
  theme(title = element_text(size = 10)) + 
  NoLegend() + 
  NoGrid()

p2 = DimPlot(refquery, group.by = "annot_cells", shuffle = TRUE, pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE) + 
  AddStyle(title="Annotation with reference dataset", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") + 
  theme(title = element_text(size = 10)) + 
  NoGrid()

p = p1 + p2
p

facet_names <- c(`TRUE` = "Query", `FALSE` = "Reference")

p = DimPlot(refquery, group.by = "annot_cells", shuffle = TRUE, split.by = "id", pt.size=param$pt_size, cols = param$col_annotation_all, label = TRUE, label.size = 3, repel = TRUE) + 
  AddStyle(title="Annotation with reference dataset", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme_classic() +
  theme(title = element_text(size = 10)) +
  NoGrid() +
  facet_wrap(~id %in% c("query"), drop = TRUE, labeller = as_labeller(facet_names))

p

Highlight common cell types B-cells, CD4+ T-cells, CD8+ T-cells, Monocytes, NK cells.

Idents(reference) = "annot_cells"
sc_subset_ref <- subset(reference, idents = celltype_filter)
Idents(query) = "annot_cells"
sc_subset_query <- subset(query, idents = celltype_filter)

p1 = DimPlot(refquery, group.by = param$celltype, shuffle = TRUE, pt.size=param$pt_size, label = TRUE, label.size = 3, repel = TRUE, cells = colnames(reference), cells.highlight = colnames(sc_subset_ref), sizes.highlight = param$pt_size) + 
  AddStyle(title="Reference", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme(title = element_text(size = 10)) +
  scale_color_manual(labels = c("Unique cell types","Common cell types"), values = c(param$col_bg, param$col)) +
  NoLegend() +
  NoGrid()

p2 = DimPlot(refquery, group.by = param$celltype, shuffle = TRUE, pt.size=param$pt_size, label = TRUE, label.size = 3, repel = TRUE, cells = colnames(query), cells.highlight = colnames(sc_subset_query), sizes.highlight = param$pt_size) + 
  AddStyle(title="Query", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme(title = element_text(size = 10)) +
  scale_color_manual(labels = c("Unique cell types","Common cell types"), values = c(param$col_bg, param$col)) +
  NoGrid()

p = p1 + p2
p


QC of combined umap projection

This section of the report visualizes the above calculated cell cycle scores. Scoring is based on the strategy described in Tirosh et al. (2016)

p1 = DimPlot(refquery, group.by = param$celltype, shuffle = TRUE, pt.size=param$pt_size, label = TRUE, cols = param$col_annotation, label.size = 3, repel = TRUE, cells = colnames(query)) + 
  AddStyle(title="Original query annotation", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "bottom") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p2 = Seurat::DimPlot(refquery, group.by="Phase", pt.size=param$pt_size) + 
  AddStyle(title="Cell cycle phases", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p3 = suppressMessages(Seurat::FeaturePlot(refquery, features="nCount_RNA")) + 
  AddStyle(title="nCount_RNA", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  scale_colour_gradient(low="lightgrey", high=param$col, trans="log10") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p4 = suppressMessages(Seurat::FeaturePlot(refquery, features="nFeature_RNA")) + 
  AddStyle(title="nFeature_RNA", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  scale_colour_gradient(low="lightgrey", high=param$col, trans="log10") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p5 = suppressMessages(Seurat::FeaturePlot(refquery, features="percent_mt", cols=c("lightgrey", param$col))) + 
  AddStyle(title="percent_mt", xlab = "UMAP 1", ylab = "UMAP 2", legend_position = "right") +
  theme(title = element_text(size = 10)) +
  NoGrid()

p = p1 + p2 + plot_spacer() + p3 + p4 + p5 +
  plot_layout(ncol = 3)
p


Compositional analysis

annot_cells_ref = data.frame(table(scR$annot_cells))
colnames(annot_cells_ref) = c("Cell_type", "Number")
annot_cells_ref = dplyr::mutate(annot_cells_ref, Group = "reference")
annot_cells_ref = dplyr::mutate(annot_cells_ref, Percent = (Number/sum(annot_cells_ref$Number)*100))

annot_cells = dplyr::mutate(annot_cells, Group = "query")

cells_tbl = rbind(annot_cells_ref, annot_cells)

p = ggplot(cells_tbl, aes(x = Cell_type, y = Percent, fill = Group)) +
  geom_bar(stat = "identity") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1.05)) +
  scale_fill_manual(values=param$col_samples)

p 

p1 = ggplot(cells_tbl, aes(x = Group, y = Percent, fill = Cell_type )) +
  geom_bar(stat = "identity") +
  theme_classic(base_size = 10) +
  scale_fill_manual(values=param$col_annotation_all)

p2 = ggplot(annot_cells_ref, aes(x = 2, y = Percent, fill = Cell_type )) +
  geom_bar(stat="identity", width=1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(label = round(Percent,1)), position = position_stack(vjust = 0.5), size = 3, color = "white") +
  theme_void() +
  xlim(0.8, 2.5) +
  scale_fill_manual(values=param$col_annotation_all)

p3 = ggplot(annot_cells, aes(x = 2, y = Percent, fill = Cell_type )) +
  geom_bar(stat="identity", width=1, color = "white") +
  coord_polar("y", start = 0) +
  geom_text(aes(label = round(Percent,1)), position = position_stack(vjust = 0.5), size = 3, color = "white") +
  theme_void() +
  xlim(0.8, 2.5) +
  scale_fill_manual(values=param$col_annotation_all)

p = p1 + p2 + plot_spacer() + p3
p




Data export

# Add experiment details
Seurat::Misc(sc, "experiment") = list(project_id=param$project_id, date=Sys.Date(), species=gsub("_gene_ensembl", "", param$mart_dataset))

# Add parameter
Seurat::Misc(sc, "parameters") = param

# Add technical output (note: cannot use Misc function here)
sc@misc$technical = data.frame(ScrnaseqSessionInfo(param$path_to_git))
### Export seurat object
saveRDS(sc, file=file.path(param$path_out, "data", "sc.rds"))
### Convert data
################################################################################

### Export to Loupe Cell Browser
if (all(param$path_data$type == "10x")) { 
  
  # Export reductions (umap, pca, others)
  for(r in Seurat::Reductions(sc)) {
    write.table(Seurat::Embeddings(sc, reduction=r)[,1:2] %>% as.data.frame() %>% tibble::rownames_to_column(var="Barcode"),
                file=file.path(param$path_out, "data", paste0("Loupe_projection_", r, ".csv")), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
  }
  
  # Export categorical metadata
  loupe_meta = sc@meta.data
  idx_keep = sapply(1:ncol(loupe_meta), function(x) !is.numeric(loupe_meta[,x]))
  write.table(x=loupe_meta[, idx_keep] %>% tibble::rownames_to_column(var="barcode"), 
              file=file.path(param$path_out, "data", "Loupe_metadata.csv"), col.names=TRUE, row.names=FALSE, quote=FALSE, sep=",")
}



### Export as andata object (h5ad file)
# Convert Seurat v5 single cell object to anndata object
# Does not work for SCT at the moment
# However, it would contain counts and data layer
#scCustomize::as.anndata(x = sc, file_path = file.path(param$path_out, "data"), file_name = "sc_anndata.h5ad", assay="RNA", 
#                        main_layer = "data", other_layers = "counts", transer_dimreduc = TRUE, verbose = FALSE)

# scesy only worked for v3 and v4 objects
# Convert to V3/4/Assay structure
sc_v3 = scCustomize::Convert_Assay(seurat_object = sc, convert_to = "V3", assay = "RNA")

# Convert Seurat v3 single cell object to anndata object
adata = sceasy::convertFormat(sc_v3, from="seurat", to="anndata", outFile=NULL, assay=Seurat::DefaultAssay(sc_v3))
# Write to h5ad file
adata$write(file.path(param$path_out, "data", "sc_anndata.h5ad"), compression="gzip")



### Export count matrix
invisible(ExportSeuratAssayData(sc, 
                                dir=file.path(param$path_out, "data", "counts"), 
                                assays=param$assay_raw, 
                                slot="counts",
                                include_cell_metadata_cols=colnames(sc[[]]),
                                metadata_prefix=paste0(param$project_id, ".")))


### Export metadata
openxlsx::write.xlsx(x=sc[[]] %>% tibble::rownames_to_column(var="Barcode"), file=file.path(param$path_out, "data", "cell_metadata.xlsx"), rowNames=FALSE, colNames=TRUE)
Export seurat object

We export the assay data, cell metadata, clustering and visualization.

Result files:
* sc.rds: Seurat object for import into R
* cell_metadata.xlsx: Cell metadata in excel format
* counts folder: Contains raw count matrix of the entire dataset (sparse matrix)

Export as andata object

We export the assay data, cell metadata, clustering and visualization in andata format.

Result file:
* sc.h5ad: H5AD object

Export to Loupe Cell Browser

If all provided datasets are of type “10x”, we export the UMAP 2D visualization, metadata such as the cell clusters, and lists of differentially expressed genes, so you can open and work with these in the Loupe Cell Browser (https://support.10xgenomics.com/single-cell-gene-expression/software/visualization/latest/what-is-loupe-cell-browser).

Result files are:
* Loupe_projection_(umap|pca|…).csv: Seurat UMAP/PCA/… projections for visualization
* Loupe_metadata.csv: Seurat cell meta data including clusters and cell cycle phases

Projections can be imported in Loupe via “Import Projection”, cell meta data via “Import Categories” and gene sets via “Import Lists”.

# Output reference seurat object
saveRDS(scR, file=file.path(param$path_out, "data", "scR.rds"))





Parameters and software versions

The following parameters were used to run the workflow.

Parameters
out = ScrnaseqParamsInfo(params=param)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"), full_width=FALSE, position="left")
Name Value
path_to_git /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis
path_to_basic_settings /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/basic_settings.R
path_to_advanced_settings /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/advanced_settings.R
scriptname scripts/dataset_mapping/dataset_mapping_seurat.Rmd
author kosankem
assay_raw RNA
downsample_cells_equally FALSE
cell_filter nFeature_RNA:20, NA; nCount_RNA:200, 20000; percent_mt:0, 18; Sample1:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18); Sample2:nFeature_RNA=c(20, NA), nCount_RNA=c(200, 20000), percent_mt=c(0, 18)
feature_filter min_counts:1; min_cells:5; Sample1:min_counts=1, min_cells=5; Sample2:min_counts=1, min_cells=5
samples_min_cells 10
norm RNA
cc_remove FALSE
cc_remove_all FALSE
cc_rescore_after_merge TRUE
integrate_samples method:merge; dimensions:30; k_anchor:10; k_weight:100; reference:; integration_function:CCAIntegration
experimental_groups homogene
pc_n 8
cluster_k 20
umap_k 30
cluster_resolution_test 0.5, 0.7, 0.8
cluster_resolution 0.6
marker_padj 0.05
marker_log2FC 1
marker_pct 0.25
enrichr_site Enrichr
enrichr_padj 0.05
col_palette_samples ggsci::pal_igv
col_palette_clusters ggsci::pal_igv
col_palette_annotation ggsci::pal_ucscgb
col #0086b3
col_bg #D3D3D3
pt_size 0.5
liana_methods logfc, natmi, cellphonedb
liana_agg_rank_threshold 0.01
celltype annotation
reduction umap
predicted_score_threshold 0.9
percent_predicted_cells_threshold 0.1
project_id Testdata
data /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/cluster_analysis/data/sc.rds
download_test_datasets download_10x_pbmc_1k_healthyDonor_v3Chemistry
species human
refdata /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata_2/cluster_analysis/data/sc.rds
path_out /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/output/Testdata/dataset_mapping_seurat
debugging_mode default_debugging
mart_dataset hsapiens_gene_ensembl
mt ^MT-
enrichr_dbs GO_Biological_Process_2023, WikiPathway_2023_Human, Azimuth_2023, CellMarker_2024
annotation_dbs BlueprintEncodeData()
annot_version 98
annot_main ensembl=ensembl_gene_id, symbol=external_gene_name, entrez=entrezgene_accession
mart_attributes ensembl_gene_id, external_gene_name, entrezgene_accession, chromosome_name, start_position, end_position, percentage_gene_gc_content, gene_biotype, strand, description
path_reference /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98
reference hsapiens_gene_ensembl.v98.annot.txt
file_annot /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/hsapiens_gene_ensembl.v98.annot.txt
file_cc_genes /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/references/hsapiens_gene_ensembl/98/cell_cycle_markers.xlsx
path_data name:Sample1, Sample2; type:10x, 10x; path:/mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample1/, /mnt/ngsnfs/single_cell_dev/scRNAseq_processing/sc_analysis/data/counts/sample2/
col_samples reference=#5DB1DDFF, query=#802268FF
col_clusters 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF
col_samples_ref sample1=#5050FFFF
col_clusters_ref 1=#5050FFFF, 2=#CE3D32FF, 3=#749B58FF, 4=#F0E685FF, 5=#466983FF, 6=#BA6338FF, 7=#5DB1DDFF, 8=#802268FF
col_annotation_ref Monocytes=#FF0000FF, B-cells=#FF9900FF, CD8+ T-cells=#FFCC00FF, CD4+ T-cells=#00FF00FF, NK cells=#6699FFFF
col_annotation_all Monocytes=#FF0000FF, B-cells=#FF9900FF, CD8+ T-cells=#FFCC00FF, CD4+ T-cells=#00FF00FF, NK cells=#6699FFFF, NA=#BA6338FF

This report was generated using the scripts/dataset_mapping/dataset_mapping_seurat.Rmd script. Software versions were collected at run time.

Software versions
out = ScrnaseqSessionInfo(param$path_to_git)

knitr::kable(out, align="l") %>% 
  kableExtra::kable_styling(bootstrap_options=c("striped", "hover"))
Name Value
Run on: Wed Sep 04 02:28:53 PM 2024
ktrns/scrnaseq 35dd8e070b3e3baff5405ec6fea400c39e724d1c
Container NA
R R version 4.2.1 (2022-06-23)
Platform x86_64-pc-linux-gnu (64-bit)
Operating system Ubuntu 20.04.6 LTS
Host name hpc-rc11
Host OS #138-Ubuntu SMP Wed Jun 22 15:00:31 UTC 2022 (5.4.0-122-generic)
Packages abind1.4-5, AnnotationDbi1.60.2, AnnotationHub3.6.0, ape5.8, assertthat0.2.1, backports1.4.1, basilisk1.17.2, basilisk.utils1.17.2, beachmat2.14.2, beeswarm0.4.0, bibtex0.5.1, Biobase2.58.0, BiocFileCache2.6.1, BiocGenerics0.44.0, BiocManager1.30.22, BiocNeighbors1.16.0, BiocParallel1.32.6, BiocSingular1.14.0, BiocVersion3.16.0, Biostrings2.66.0, bit4.0.5, bit644.0.5, bitops1.0-7, blob1.2.4, bluster1.8.0, bslib0.6.1, cachem1.0.8, Cairo1.6-2, celldex1.8.0, cellranger1.1.0, checkmate2.3.1, circlize0.4.16, cli3.6.2, clue0.3-65, cluster2.1.3, clustree0.5.1, codetools0.2-18, colorspace2.1-0, ComplexHeatmap2.14.0, cowplot1.1.3, crayon1.5.2, curl5.2.1, data.table1.15.2, DBI1.2.2, dbplyr2.2.1, DelayedArray0.24.0, DelayedMatrixStats1.20.0, deldir2.0-4, digest0.6.34, dir.expiry1.6.0, doParallel1.0.17, dotCall641.1-1, dplyr1.1.4, dqrng0.3.2, edgeR3.40.2, ellipsis0.3.2, enrichR3.2, evaluate0.23, ExperimentHub2.6.0, fansi1.0.6, farver2.1.1, fastDummies1.7.3, fastmap1.1.1, filelock1.0.3, fitdistrplus1.1-11, forcats1.0.0, foreach1.5.2, future1.33.1, future.apply1.11.1, generics0.1.3, GenomeInfoDb1.34.9, GenomeInfoDbData1.2.9, GenomicRanges1.50.2, GetoptLong1.0.5, ggbeeswarm0.7.2, ggforce0.4.2, ggplot23.5.0, ggprism1.0.5, ggraph2.2.1, ggrastr1.0.2, ggrepel0.9.5, ggridges0.5.6, ggsci3.0.1, GlobalOptions0.1.2, globals0.16.2, glue1.7.0, goftest1.2-3, graphlayouts1.1.1, gridExtra2.3, gridGraphics0.5-1, gtable0.3.4, highr0.10, hms1.1.3, htmltools0.5.7, htmlwidgets1.6.4, httpuv1.6.14, httr1.4.7, ica1.0-3, igraph2.0.2, interactiveDisplayBase1.36.0, IRanges2.32.0, irlba2.3.5.1, iterators1.0.14, janitor2.2.0, jquerylib0.1.4, jsonlite1.8.8, kableExtra1.4.0, KEGGREST1.38.0, KernSmooth2.23-20, knitcitations1.0.12, knitr1.45, labeling0.4.3, later1.3.2, lattice0.20-45, lazyeval0.2.2, leiden0.4.3.1, liana0.1.14, lifecycle1.0.4, limma3.54.2, listenv0.9.1, lmtest0.9-40, locfit1.5-9.9, logger0.3.0, lubridate1.9.3, magrittr2.0.3, MASS7.3-58, MAST1.27.1, Matrix1.6-5, MatrixGenerics1.10.0, matrixStats1.1.0, memoise2.0.1, metapod1.6.0, mime0.12, miniUI0.1.1.1, munsell0.5.0, nlme3.1-157, OmnipathR3.13.22, openxlsx4.2.5.2, paletteer1.6.0, parallelly1.37.1, patchwork1.2.0, pbapply1.7-2, pheatmap1.0.12, pillar1.9.0, pkgconfig2.0.3, plotly4.10.4, plyr1.8.9, png0.1-8, polyclip1.10-6, prettyunits1.2.0, progress1.2.3, progressr0.14.0, promises1.2.1, purrr1.0.2, R.methodsS31.8.2, R.oo1.26.0, R.utils2.12.3, R62.5.1, ragg1.2.7, RANN2.6.1, rappdirs0.3.3, RColorBrewer1.1-3, Rcpp1.0.12, RcppAnnoy0.0.22, RcppHNSW0.6.0, RCurl1.98-1.14, readr2.1.5, readxl1.4.3, RefManageR1.4.0, rematch22.1.2, remotes2.4.2.1, renv0.16.0, reshape21.4.4, reticulate1.35.0, rjson0.2.21, rlang1.1.3, rmarkdown2.25, ROCR1.0-11, RSpectra0.16-1, RSQLite2.3.5, rstudioapi0.15.0, rsvd1.0.5, Rtsne0.17, rvest1.0.4, S4Vectors0.36.2, sass0.4.8, ScaledMatrix1.6.0, scales1.3.0, scater1.26.1, scattermore1.2, scCustomize2.1.2, sceasy0.0.7, scran1.26.2, sctransform0.4.1, scuttle1.8.4, sessioninfo1.2.2, Seurat5.0.2, SeuratObject5.0.1, shape1.4.6.1, shiny1.8.0, SingleCellExperiment1.20.1, SingleR2.0.0, snakecase0.11.1, sp2.1-3, spam2.10-0, sparseMatrixStats1.10.0, spatstat.data3.0-4, spatstat.explore3.2-6, spatstat.geom3.2-9, spatstat.random3.2-3, spatstat.sparse3.0-3, spatstat.utils3.0-4, statmod1.5.0, stringi1.8.3, stringr1.5.1, SummarizedExperiment1.28.0, survival3.2-13, svglite2.1.3, systemfonts1.0.5, tensor1.5, textshaping0.3.7, tibble3.2.1, tidygraph1.3.1, tidyr1.3.1, tidyselect1.2.0, timechange0.3.0, tweenr2.0.3, tzdb0.4.0, utf81.2.4, uwot0.1.16, vctrs0.6.5, vipor0.4.7, viridis0.6.5, viridisLite0.4.2, withr3.0.0, WriteXLS6.7.0, xfun0.41, XML3.99-0.16.1, xml21.3.6, xtable1.8-4, XVector0.38.0, yaml2.3.8, zip2.3.1, zlibbioc1.44.0, zoo1.8-12


Credits and References

This Workflow was developed as module of the sc_analysis workflow at the Research Core Unit Genomics, Hannover Medical School (Hannover, Germany). Seurat Vignettes were initially used as templates (Hao et al. (2023), Hao et al. (2021)).

# Writes knitcitations references to references.bib file.
knitcitations::write.bibtex(file=file.path(param$path_out, "references.bib"))
Hao, Y., Hao, S., Andersen-Nissen, E., III, W.M.M., Zheng, S., Butler, A., Lee, M.J., Wilk, A.J., Darby, C., Zagar, M., Hoffman, P., Stoeckius, M., Papalexi, E., Mimitou, E.P., Jain, J., Srivastava, A., Stuart, T., Fleming, L.B., Yeung, B., Rogers, A.J., McElrath, J.M., Blish, C.A., Gottardo, R., Smibert, P., Satija, R., 2021. Integrated analysis of multimodal single-cell data. Cell. https://doi.org/10.1016/j.cell.2021.04.048
Hao, Y., Stuart, T., Kowalski, M.H., Choudhary, S., Hoffman, P., Hartman, A., Srivastava, A., Molla, G., Madad, S., Fernandez-Granda, C., Satija, R., 2023. Dictionary learning for integrative, multimodal and scalable single-cell analysis. Nature Biotechnology. https://doi.org/10.1038/s41587-023-01767-y
Tirosh, I., Izar, B., Prakadan, S.M., Wadsworth, M.H., Treacy, D., Trombetta, J.J., Rotem, A., Rodman, C., Lian, C., Murphy, G., Fallahi-Sichani, M., Dutton-Regester, K., Lin, J.-R., Cohen, O., Shah, P., Lu, D., Genshaft, A.S., Hughes, T.K., Ziegler, C.G.K., Kazer, S.W., Gaillard, A., Kolb, K.E., Villani, A.-C., Johannessen, C.M., Andreev, A.Y., Van Allen, E.M., Bertagnolli, M., Sorger, P.K., Sullivan, R.J., Flaherty, K.T., Frederick, D.T., Jan’e-Valbuena, J., Yoon, C.H., Rozenblatt-Rosen, O., Shalek, A.K., Regev, A., Garraway, L.A., 2016. Dissecting the multicellular ecosystem of metastatic melanoma by single-cell RNA-seq. Science 352, 189–196. https://doi.org/10.1126/science.aad0501